Libraries

library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.3
## ✓ tibble  3.0.1     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.6.2
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(knitr)
library(cowplot) # for plot_grid and draw_label
## 
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
##   default ggplot2 theme anymore. To recover the previous
##   behavior, execute:
##   theme_set(theme_cowplot())
## ********************************************************
library(RColorBrewer)
library(pander) # to make session info
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(snakecase)
library(corrr)

Config

inputs_dir <- "inputs"
file_pattern <- "survey_data"
data_file <- tibble(files = list.files(inputs_dir, file_pattern)) %>%
  arrange(desc(files)) %>%
  top_n(1, wt = files) %>%
  pull(files)

print(data_file)
## [1] "survey_data.2020-07-29_11.35.40.txt"

Settings

min_genes_gt_0 <- 100

Functions

is_outlier <- function(x) {
  (x > quantile(x, 0.75) + 1.5*IQR(x)) |
    (x < quantile(x, 0.25) - 1.5*IQR(x))
}


most_common_val_w_percent <- function(value_vector = c("a", "a", "b")){
  mc_vals <- tabyl(value_vector) %>%
    arrange(desc(n)) %>%
    top_n(1, n) 
  paste0(round(100*mc_vals$percent), "% ", mc_vals$value_vector)
}

# StatBin2 allows depiction of empty bins as blank instead of a horizontal line:
# https://stackoverflow.com/questions/57128090/remove-baseline-color-for-geom-histogram
StatBin2 <- ggproto(
  "StatBin2", 
  StatBin,
  compute_group = function (data, scales, binwidth = NULL, bins = NULL, 
                            center = NULL, boundary = NULL, 
                            closed = c("right", "left"), pad = FALSE, 
                            breaks = NULL, origin = NULL, right = NULL, 
                            drop = NULL, width = NULL) {
    if (!is.null(breaks)) {
      if (!scales$x$is_discrete()) {
        breaks <- scales$x$transform(breaks)
      }
      bins <- ggplot2:::bin_breaks(breaks, closed)
    }
    else if (!is.null(binwidth)) {
      if (is.function(binwidth)) {
        binwidth <- binwidth(data$x)
      }
      bins <- ggplot2:::bin_breaks_width(scales$x$dimension(), binwidth, 
                                         center = center, boundary = boundary, 
                                         closed = closed)
    }
    else {
      bins <- ggplot2:::bin_breaks_bins(scales$x$dimension(), bins, 
                                        center = center, boundary = boundary, 
                                        closed = closed)
    }
    res <- ggplot2:::bin_vector(data$x, bins, weight = data$weight, pad = pad)

    # drop 0-count bins completely before returning the dataframe
    res <- res[res$count > 0, ] 

    res
  })

Import data

col_spec <- cols(
  .default = col_double(),
  dataset_id = col_character(),
  disease = col_character(),
  pedaya = col_character(),
  race = col_character(),
  ethnicity = col_character(),
  gender = col_character(),
  cohort_name = col_character(),
  cohort_code = col_character(),
  doi_link = col_character(),
  source_repository = col_character(),
  repository_cohort_accession = col_character(),
  repository_dataset_accession = col_character()
)

counts_and_meta_raw <- read_tsv(file.path(inputs_dir, data_file), col_types = col_spec)

counts_and_meta <- counts_and_meta_raw %>%
  group_by(cohort_code) %>%
  mutate(n_in_cohort = n())

write_csv(counts_and_meta, "results/Table_S1.csv")

Colors

# brewer.pal(8, "Paired")

these_colors <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C", 
"#FDBF6F", "#FF7F00")

# light blue, dark blue, etc: green, red, orange (then purple and brown)

these_colors <- c("#1F78B4", "#1F78B4", "#33A02C", "#33A02C", "#E31A1C", "#E31A1C", 
"#FF7F00", "#FF7F00")

names(these_colors) <-  c("not assigned", "Total reads", "Non-exonic reads",  "Exonic reads", "Duplicate reads",  "Non-duplicate reads",  "Unmapped reads", "Mapped reads")

these_colors_with_MEND = c(these_colors, "MEND reads"="#000000")

Datasets analyzed

nrow(counts_and_meta)
## [1] 2179

Diseases in dataset

counts_and_meta %>% tabyl(disease)
##                                     disease   n      percent
##                acute lymphoblastic leukemia 680 0.3120697568
##             acute megakaryoblastic leukemia 103 0.0472693896
##                      acute myeloid leukemia 221 0.1014226709
##             acute undifferentiated leukemia   1 0.0004589261
##                      adrenocortical adenoma   1 0.0004589261
##                       adrenocortical cancer   2 0.0009178522
##                    adrenocortical carcinoma  20 0.0091785223
##                   alveolar rhabdomyosarcoma  40 0.0183570445
##                          cholangiocarcinoma   1 0.0004589261
##                    choroid plexus carcinoma  25 0.0114731528
##         desmoplastic small round cell tumor   4 0.0018357045
##      dysembryoplastic neuroepithelial tumor  13 0.0059660395
##                  embryonal rhabdomyosarcoma  42 0.0192748967
##  embryonal tumor with multilayered rosettes   1 0.0004589261
##                                  ependymoma  98 0.0449747591
##                         epithelioid sarcoma   1 0.0004589261
##                               Ewing sarcoma  70 0.0321248279
##      fibrolamellar hepatocellular carcinoma   3 0.0013767783
##                                fibromatosis   1 0.0004589261
##              gastrointestinal stromal tumor   4 0.0018357045
##                             germ cell tumor   1 0.0004589261
##                     glioblastoma multiforme  29 0.0133088573
##                                      glioma 193 0.0885727398
##                              hepatoblastoma   1 0.0004589261
##                    hepatocellular carcinoma   3 0.0013767783
##                 hypereosinophillic syndrome   1 0.0004589261
##            juvenile myelomonocytic leukemia   1 0.0004589261
##                                    leukemia   1 0.0004589261
##                                    lymphoma  49 0.0224873795
##                             medulloblastoma 201 0.0922441487
##                                    melanoma   7 0.0032124828
##             melanotic neuroectodermal tumor   1 0.0004589261
##                                  meningioma   1 0.0004589261
##                             myofibromatosis   2 0.0009178522
##                    nasopharyngeal carcinoma   2 0.0009178522
##                               neuroblastoma  12 0.0055071134
##                    neuroendocrine carcinoma   1 0.0004589261
##                                not reported   2 0.0009178522
##                not reported - QC fail by PI   1 0.0004589261
##                                osteosarcoma 157 0.0720513997
##           ovarian serous cystadenocarcinoma   1 0.0004589261
##                                      PEComa   1 0.0004589261
##                    pleuropulmonary blastoma   1 0.0004589261
##                              retinoblastoma   2 0.0009178522
##                              rhabdoid tumor  65 0.0298301973
##                            rhabdomyosarcoma  53 0.0243230840
##    spindle cell/sclerosing rhabdomyosarcoma   3 0.0013767783
##          supratentorial embryonal tumor NOS  18 0.0082606700
##                            synovial sarcoma  22 0.0100963745
##                undifferentiated sarcoma NOS   3 0.0013767783
##       undifferentiated spindle cell sarcoma   2 0.0009178522
##                                 wilms tumor  11 0.0050481872
disease_freq_table <- counts_and_meta$disease %>%
  str_to_sentence %>%
  str_replace(" nos$", " NOS") %>%
  as.factor %>%
  fct_infreq %>%
  fct_lump(prop=0.01) %>%
  tabyl (var1 = "Disease") %>%
  adorn_pct_formatting(digits = 1)

colnames(disease_freq_table)[1]="Disease"

write_tsv(disease_freq_table, "results/disease_freq_table.tsv")
write_csv(disease_freq_table, "results/disease_freq_table.csv")

disease_freq_table  %>%
  kable %>% 
  kable_styling(bootstrap_options = "striped", full_width = F)
Disease n percent
Acute lymphoblastic leukemia 680 31.2%
Acute myeloid leukemia 221 10.1%
Medulloblastoma 201 9.2%
Glioma 193 8.9%
Osteosarcoma 157 7.2%
Acute megakaryoblastic leukemia 103 4.7%
Ependymoma 98 4.5%
Ewing sarcoma 70 3.2%
Rhabdoid tumor 65 3.0%
Rhabdomyosarcoma 53 2.4%
Lymphoma 49 2.2%
Embryonal rhabdomyosarcoma 42 1.9%
Alveolar rhabdomyosarcoma 40 1.8%
Glioblastoma multiforme 29 1.3%
Choroid plexus carcinoma 25 1.1%
Synovial sarcoma 22 1.0%
Other 131 6.0%
# many are leukemias
table(str_detect(counts_and_meta$disease, "leukemia"))
## 
## FALSE  TRUE 
##  1172  1007

Cohorts in dataset

old_cohort_size_histogram <- ggplot(counts_and_meta) + 
  geom_histogram(aes(n_in_cohort, group=cohort_code), 
                 fill = "lightgrey", color = "black", stat = StatBin2) +
  theme_minimal(base_size = 20) +
  theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +  xlab("Cohort size") +
  ylab("Cohorts") + 
  theme(legend.position="none")


# max(table(counts_and_meta$n_in_cohort)/as.numeric(names(table(counts_and_meta$n_in_cohort))))
cohort_hist_bin_size <- 10


cohort_size_histogram_raw <- ggplot(counts_and_meta %>%
  select(cohort_code, n_in_cohort) %>%
  distinct ) + 
  geom_histogram(aes(n_in_cohort), 
                 fill = "lightgrey", color = "black", stat = StatBin2,
                 breaks = seq(0,500, by=10)) +
  theme_minimal(base_size = 20) +
  theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +  
  xlab("Cohort size") +
  theme(legend.position="none")

max_y_val <- max(ggplot_build(cohort_size_histogram_raw)$data[[1]][,"y"])


cohort_size_histogram <- cohort_size_histogram_raw + 
  scale_y_continuous("Cohorts", breaks = seq(0, ceiling(max_y_val/2) * 2, 2)) 
  

cohort_size_histogram

n_distinct(counts_and_meta$cohort_code)
## [1] 48
sort(unique(counts_and_meta$n_in_cohort))
##  [1]   3   4   6   7   8  10  15  17  19  22  23  24  25  26  31  35  39
## [18]  43  44  54  56  62  63  65  78  82  84  88  89 103 127 137 141 337
counts_and_meta %>%
  select(cohort_code, n_in_cohort) %>%
  distinct %>%
  arrange(desc(n_in_cohort)) %>%
  kable
cohort_code n_in_cohort
Cohort_1 337
Cohort_2 141
Cohort_3 137
Cohort_4 127
Cohort_5 103
Cohort_6 89
Cohort_7 88
Cohort_8 84
Cohort_9 82
Cohort_10 78
Cohort_11 65
Cohort_12 63
Cohort_13 62
Cohort_14 56
Cohort_15 54
Cohort_16 44
Cohort_17 43
Cohort_18 39
Cohort_19 35
Cohort_20 31
Cohort_21 31
Cohort_22 26
Cohort_23 25
Cohort_24 25
Cohort_26 24
Cohort_25 24
Cohort_30 23
Cohort_27 23
Cohort_28 23
Cohort_29 23
Cohort_31 22
Cohort_32 19
Cohort_33 19
Cohort_34 17
Cohort_35 15
Cohort_36 10
Cohort_37 8
Cohort_38 8
Cohort_41 7
Cohort_39 7
Cohort_40 7
Cohort_42 6
Cohort_44 6
Cohort_43 6
Cohort_45 6
Cohort_47 4
Cohort_46 4
Cohort_48 3
counts_and_meta %>%
  select(cohort_code, n_in_cohort) %>%
  distinct %>%
  pull(n_in_cohort) %>%
  median
## [1] 24.5

Table S2 - repositories

repo_info <- read_tsv("inputs/repos.tsv")
## Parsed with column specification:
## cols(
##   repo_name = col_character(),
##   repo_abbrev = col_character(),
##   repo_url = col_character(),
##   source_accession_pattern = col_character()
## )
repo_table <- repo_info %>%
  filter(repo_abbrev %in% counts_and_meta$source_repository) %>%
  rename(`Name`=repo_name, `Abbreviation`=repo_abbrev, URL = repo_url) %>%
  select(-source_accession_pattern) 

write_tsv(repo_table, "results/Table_S2_repo_table.tsv")
write_csv(repo_table, "results/Table_S2_repo_table.csv")

repo_table %>%
  kable %>% 
  kable_styling(bootstrap_options = "striped", full_width = F)
Name Abbreviation URL
Short Read Archive SRA https://www.ncbi.nlm.nih.gov/sra
European Genome-phenome Archive EGA https://www.ebi.ac.uk/ega/home
St. Jude Cloud SJC https://www.stjude.cloud/
Database of Genotypes and Phenotypes dbGaP https://www.ncbi.nlm.nih.gov/gap/
Cavatica Cavatica https://cavatica.squarespace.com/

Table S3 - data access table

cohort_table <- counts_and_meta %>% 
  select(cohort_code, cohort_name, source_repository, repository_cohort_accession, n_in_cohort) %>% # removed DOI link
#  group_by(across(c(-disease)))
  distinct %>%
  arrange(desc(n_in_cohort)) 

colnames(cohort_table) <- to_any_case(colnames(cohort_table), case = "sentence")

write_tsv(cohort_table, "results/Table_S3_cohort_table.tsv")
write_csv(cohort_table, "results/Table_S3_cohort_table.csv")




cohort_table %>% 
  kable %>% 
  kable_styling(bootstrap_options = "striped", full_width = F)
Cohort code Cohort name Source repository Repository cohort accession N in cohort
Cohort_1 TARGET-10 dbGaP phs000218 337
Cohort_2 SJC_BALL SJC SJC-DS-1001 141
Cohort_3 TARGET-20 dbGaP phs000218 137
Cohort_4 EGAD00001003279 EGA EGAD00001003279 127
Cohort_5 SJC_AMLM SJC SJC-DS-1001 103
Cohort_6 phs000673 dbGaP phs000673 89
Cohort_7 TARGET-40 dbGaP phs000218 88
Cohort_8 phs000720 dbGaP phs000720 84
Cohort_9 SJC_HGG SJC SJC-DS-1001 82
Cohort_10 SJC_EPD SJC SJC-DS-1001 78
Cohort_11 TARGET-52 dbGaP phs000218 65
Cohort_12 EGAD00001001098 EGA EGAD00001001098 63
Cohort_13 phs000768 dbGaP phs000768 62
Cohort_14 SJC_ETV SJC SJC-DS-1001 56
Cohort_15 SJC_LGG SJC SJC-DS-1001 54
Cohort_16 SJC_CBF SJC SJC-DS-1001 44
Cohort_17 SJC_RHB SJC SJC-DS-1001 43
Cohort_18 EGAD00001001620 EGA EGAD00001001620 39
Cohort_19 phs000699 dbGaP phs000699 35
Cohort_20 CBTTC Cavatica CBTTC 31
Cohort_21 SJC_ERG SJC SJC-DS-1001 31
Cohort_22 SJC_PHALL SJC SJC-DS-1001 26
Cohort_23 EGAD00001001666 EGA EGAD00001001666 25
Cohort_24 SJC_CPC SJC SJC-DS-1001 25
Cohort_26 phs000900 dbGaP phs000900 24
Cohort_25 EGAD00001000648 EGA EGAD00001000648 24
Cohort_30 TARGET-21 dbGaP phs000218 23
Cohort_27 EGAD00001000356 EGA EGAD00001000356 23
Cohort_28 EGAD00001000617 EGA EGAD00001000617 23
Cohort_29 SJC_OS SJC SJC-DS-1001 23
Cohort_31 EGAD00001001927 EGA EGAD00001001927 22
Cohort_32 EGAD00001002680 EGA EGAD00001002680 19
Cohort_33 SRP126664 SRA SRP126664 19
Cohort_34 EGAD00001000158 EGA EGAD00001000158 17
Cohort_35 SRP092501 SRA SRP092501 15
Cohort_36 EGAD00001000826 EGA EGAD00001000826 10
Cohort_37 SJC_E SJC SJC-DS-1001 8
Cohort_38 SJC_MB SJC SJC-DS-1001 8
Cohort_41 TARGET-30 dbGaP phs000218 7
Cohort_39 SJC_HYPO SJC SJC-DS-1001 7
Cohort_40 SJC_MEL SJC SJC-DS-1001 7
Cohort_42 EGAD00001000328 EGA EGAD00001000328 6
Cohort_44 SJC_Other SJC SJC-DS-1001 6
Cohort_43 SJC_INF SJC SJC-DS-1001 6
Cohort_45 SJC_WLM SJC SJC-DS-1001 6
Cohort_47 TARGET-50 dbGaP phs000218 4
Cohort_46 SRP006575 SRA SRP006575 4
Cohort_48 SRP040454 SRA SRP040454 3

Race and gender in dataset

counts_and_meta %>%
  tabyl(race)
##                                       race    n      percent valid_percent
##                                      Asian   27 0.0123910050   0.044850498
##                     Black/African American   70 0.0321248279   0.116279070
##  Native Hawaiian or Other Pacific Islander    3 0.0013767783   0.004983389
##         not-white (no other info provided)    1 0.0004589261   0.001661130
##                                      Other    7 0.0032124828   0.011627907
##                                      White  494 0.2267094998   0.820598007
##                                       <NA> 1577 0.7237264800            NA
counts_and_meta %>%
  select(race) %>%
  na.omit() %>%
  tabyl(race) %>%
  adorn_totals("row")
## Adding missing grouping variables: `cohort_code`
##                                       race   n     percent
##                                      Asian  27 0.044850498
##                     Black/African American  70 0.116279070
##  Native Hawaiian or Other Pacific Islander   3 0.004983389
##         not-white (no other info provided)   1 0.001661130
##                                      Other   7 0.011627907
##                                      White 494 0.820598007
##                                      Total 602 1.000000000
counts_and_meta %>%
    select(ethnicity) %>%
  na.omit() %>%
tabyl(ethnicity) %>%
  adorn_totals("row")
## Adding missing grouping variables: `cohort_code`
##               ethnicity   n   percent
##      Hispanic or Latino 128 0.1486643
##  Not Hispanic or Latino 733 0.8513357
##                   Total 861 1.0000000
counts_and_meta %>%
  filter(ethnicity == "Hispanic or Latino") %>%
  tabyl(race)
##                    race  n   percent valid_percent
##  Black/African American  1 0.0078125    0.01123596
##                   Other  3 0.0234375    0.03370787
##                   White 85 0.6640625    0.95505618
##                    <NA> 39 0.3046875            NA
counts_and_meta %>%
  filter(ethnicity == "Hispanic or Latino") %>%
  select(race) %>%
  na.omit() %>%
tabyl(race) %>%
  adorn_totals("row")
## Adding missing grouping variables: `cohort_code`
##                    race  n    percent
##  Black/African American  1 0.01123596
##                   Other  3 0.03370787
##                   White 85 0.95505618
##                   Total 89 1.00000000

Genders in dataset

table(counts_and_meta$gender, useNA = "always")
## 
## female   male   <NA> 
##    709    983    487
# samples with reported genders
counts_and_meta %>% 
  filter(! is.na(gender)) %>%
  filter(! gender %in% c("not reported", "unknown")) %>%
  tabyl(gender) %>%
  adorn_totals
##  gender    n   percent
##  female  709 0.4190307
##    male  983 0.5809693
##   Total 1692 1.0000000

Ages in dataset

table(counts_and_meta$pedaya, useNA = "always")
## 
##                  No Yes, age < 30 years                <NA> 
##                  67                2088                  24
# at least one target sample is >30

n_ped_aya <- sum(counts_and_meta$pedaya=="Yes, age < 30 years" | counts_and_meta$age_at_dx<30, na.rm = TRUE)
n_ped_aya
## [1] 2088
n_ped_aya/nrow(counts_and_meta)
## [1] 0.9582377
# more than 95% of which are pediatric <30; of the Datasets with specific ages at diagnosis reported, the median was 7

median(counts_and_meta$age_at_dx, na.rm = TRUE)
## [1] 8.32
table(is.na(counts_and_meta$age_at_dx))
## 
## FALSE  TRUE 
##  1472   707

Read lengths in dataset

seq_len_round_25 <- round(counts_and_meta$seq_length / 25) * 25
table(seq_len_round_25, useNA = "always")
## seq_len_round_25
##   25   50   75  100  125 <NA> 
##    4  238  366 1544   27    0
tabyl(seq_len_round_25)
##  seq_len_round_25    n     percent
##                25    4 0.001835704
##                50  238 0.109224415
##                75  366 0.167966957
##               100 1544 0.708581918
##               125   27 0.012391005
table(is.na(counts_and_meta$seq_length))
## 
## FALSE 
##  2179
table(counts_and_meta$seq_length)
## 
##   36   50   51   75   76   81  100  101  110  125 
##    4   11  227  279   40   47   98 1431   15   27
summary(counts_and_meta$seq_length)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   36.00   76.00  101.00   91.51  101.00  125.00
IQR(counts_and_meta$seq_length, na.rm = TRUE)
## [1] 25
 old_read_length_histogram <-  ggplot(counts_and_meta) + 
  geom_histogram(aes(seq_length, group=cohort_code), 
                 fill = "lightgrey", color = "black", stat = StatBin2) +
      theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  xlab("Sequence length") +
    ylab("Datasets")
 
 read_length_histogram <-  ggplot(counts_and_meta) + 
  geom_histogram(aes(seq_length), 
                 fill = "lightgrey", color = "black", stat = StatBin2,
                 binwidth = 1) +
      theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  xlab("Sequence length") +
    ylab("Datasets") +
   expand_limits(x=0)
  
read_length_histogram

# Figure 1

figure_name <- "f1"



f1 <- plot_grid(cohort_size_histogram,
read_length_histogram,
nrow=2,
labels = c("A", "B")) +
   draw_label(paste(figure_name, Sys.time()),
              x = 0, y = 0.02, hjust = 0, size = 6) 

f1

ggsave("results/Figure_1.png", f1, height=6, width=5)
ggsave("results/Figure_1.pdf", f1, height=6, width=5)

Check for counting errors

if(nrow(subset(counts_and_meta, Mapped > Total_reads)) > 0) stop("Counting error")
if(nrow(subset(counts_and_meta, MND > Mapped)) > 0) stop("Counting error")
if(nrow(subset(counts_and_meta, MEND > MND)) > 0) stop("Counting error")

Summarize total read counts

min(counts_and_meta$Total_reads)
## [1] 206916
Total_read_counts <- counts_and_meta %>% 
  ungroup %>%
  mutate(dataset_value = Total_reads/1e6) %>%
  summarize(variance= var(dataset_value),
            min = min(dataset_value),
            max = max(dataset_value),
            median = median(dataset_value),
            p25 = quantile(dataset_value, 0.25),
            p75 = quantile(dataset_value, 0.75),
            iqr = IQR(dataset_value))

kable(Total_read_counts %>%
        select(-variance, -p25, -p75), digits = 1)
min max median iqr
0.2 668 61.2 53.3
total_read_count_histogram <-  ggplot(counts_and_meta) + 
  geom_histogram(aes(Total_reads/1E6), 
                 fill = "lightgrey", color = "black", stat = StatBin2) +
      theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  xlab("Total reads") +
    ylab("Datasets")

total_read_count_histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Calculate percentages

 read_counts_with_read_type_fractions <- counts_and_meta  %>%
  arrange(desc(Total_reads)) %>%
  mutate(
    pct_unmapped_of_total = (Total_reads - Mapped) / Total_reads,
    pct_dupe_of_mapped = (Mapped - MND) / Mapped,
    pct_non_exonic_of_non_dupe = (MND - MEND) / MND
  )

read_type_fractions_long <- read_counts_with_read_type_fractions %>%
  pivot_longer(cols = starts_with("pct_"), names_to = "read_type_fraction_name", values_to = "dataset_value") 

Summarize read type fractions

comparison_of_distributions <- read_type_fractions_long %>% 
  group_by(read_type_fraction_name) %>%
  summarize(variance= var(dataset_value),
            min = min(dataset_value),
            max = max(dataset_value),
            median = median(dataset_value),
            p25 = quantile(dataset_value, 0.25),
            p75 = quantile(dataset_value, 0.75),
            iqr = IQR(dataset_value))

kable(comparison_of_distributions %>%
      mutate_if(is.double, ~100*.), digits = 3) %>% 
  kable_styling(bootstrap_options = "striped", full_width = T)
read_type_fraction_name variance min max median p25 p75 iqr
pct_dupe_of_mapped 5.884 2.853 99.997 26.887 13.450 43.033 29.583
pct_non_exonic_of_non_dupe 4.094 4.495 97.000 24.769 16.189 37.277 21.088
pct_unmapped_of_total 0.605 0.710 76.677 3.414 2.740 6.006 3.266
kable(comparison_of_distributions %>%
        select(-variance, -p25, -p75) %>%
      mutate_if(is.double, ~100*.), digits = 0) %>% 
  kable_styling(bootstrap_options = "striped", full_width = T)
read_type_fraction_name min max median iqr
pct_dupe_of_mapped 3 100 27 30
pct_non_exonic_of_non_dupe 4 97 25 21
pct_unmapped_of_total 1 77 3 3

statements about read type fractions

See comparison_of_distributions table above

unmapped

# in 77 datasets, more than 25% of reads are unmapped
table(read_counts_with_read_type_fractions$pct_unmapped_of_total>0.25)
## 
## FALSE  TRUE 
##  2102    77

duplicate

# 426 datasets have more than 50% duplicates (Fig 2A).
read_counts_with_read_type_fractions %>%
  mutate(above_50 = pct_dupe_of_mapped>0.50) %>%
  tabyl(above_50)
##  above_50    n   percent
##     FALSE 1753 0.8044975
##      TRUE  426 0.1955025
fiftypct <- read_counts_with_read_type_fractions %>%
  group_by(cohort_code) %>%
  summarize(has_a_dataset_with_more_than_50pct_dupes = any(pct_dupe_of_mapped>0.50),
            median_pct_dupe_of_mapped = median(pct_dupe_of_mapped)) %>%
  mutate(median_lt_50 = median_pct_dupe_of_mapped < 0.5,
         median_lt_50_and_has_a_dataset = median_lt_50 & has_a_dataset_with_more_than_50pct_dupes)


fiftypct %>% tabyl(median_lt_50)
##  median_lt_50  n   percent
##         FALSE  7 0.1458333
##          TRUE 41 0.8541667
fiftypct %>% tabyl(has_a_dataset_with_more_than_50pct_dupes)
##  has_a_dataset_with_more_than_50pct_dupes  n percent
##                                     FALSE 15  0.3125
##                                      TRUE 33  0.6875
fiftypct %>% tabyl(median_lt_50_and_has_a_dataset)
##  median_lt_50_and_has_a_dataset  n   percent
##                           FALSE 22 0.4583333
##                            TRUE 26 0.5416667

exonic

read_counts_with_read_type_fractions %>%
  mutate(above_50 = pct_non_exonic_of_non_dupe>0.50) %>%
  tabyl(above_50)
##  above_50    n   percent
##     FALSE 1849 0.8485544
##      TRUE  330 0.1514456
fiftypcte <- read_counts_with_read_type_fractions %>%
  group_by(cohort_code) %>%
  summarize(has_a_dataset_with_more_than_50pct_ne = any(pct_non_exonic_of_non_dupe>0.50),
            median_pct_dupe_of_mapped = median(pct_non_exonic_of_non_dupe)) %>%
  mutate(median_lt_50 = median_pct_dupe_of_mapped < 0.5,
         median_lt_50_and_has_a_dataset = median_lt_50 & has_a_dataset_with_more_than_50pct_ne)


fiftypcte %>% tabyl(median_lt_50)
##  median_lt_50  n percent
##         FALSE  3  0.0625
##          TRUE 45  0.9375
fiftypcte %>% tabyl(has_a_dataset_with_more_than_50pct_ne)
##  has_a_dataset_with_more_than_50pct_ne  n percent
##                                  FALSE 36    0.75
##                                   TRUE 12    0.25
fiftypcte %>% tabyl(median_lt_50_and_has_a_dataset)
##  median_lt_50_and_has_a_dataset  n percent
##                           FALSE 39  0.8125
##                            TRUE  9  0.1875

Correlation of depth of sequencing to duplicate read fraction

Does the depth of sequencing correlate to the number of duplicates? How much variance does that explain?

cor_tot_reads_and_dupes <- cor(read_counts_with_read_type_fractions$Total_reads, 
    read_counts_with_read_type_fractions$pct_dupe_of_mapped,
    method = c("pearson"))
round(cor_tot_reads_and_dupes, 2)
## [1] 0.58
round(cor_tot_reads_and_dupes^2, 2)
## [1] 0.34

Figure S1

Plot correlation of depth of sequencing to duplicate read fraction

this_data <- read_counts_with_read_type_fractions %>%
  select(Total_reads, pct_dupe_of_mapped)
## Adding missing grouping variables: `cohort_code`
this_plot_title <- "Correlation of Total_reads and % Duplicates"

these_stats <- this_data  %>%
  ungroup %>%
  summarize(r_corr = round(cor(Total_reads, pct_dupe_of_mapped), 2))

fs1 <- ggplot(this_data,
       aes(x=Total_reads/1E6, y=pct_dupe_of_mapped)) + 
  geom_point(alpha = 0.5, shape = 20, size =0.1) +
  geom_smooth(method = "lm", color = "black", se = FALSE) +
  geom_label(data = these_stats, 
             aes(label=paste0("r=", r_corr)), 
             x=max(this_data$Total_reads/1E6),
             y=max(this_data$pct_dupe_of_mapped),
             hjust = 1, vjust = 1
             ) +
    theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  ggtitle(this_plot_title)  + 
  theme(legend.position="none", plot.title=element_text(size=22)) +
  xlab("Read counts (million)") +
  ylab("% Duplicates")


ggsave("results/Figure_S1.png", fs1 + ggtitle(""))
## Saving 7 x 5 in image
## `geom_smooth()` using formula 'y ~ x'
ggsave("results/Figure_S1.pdf", fs1 + ggtitle(""))
## Saving 7 x 5 in image
## `geom_smooth()` using formula 'y ~ x'

Generate schematic defining read types

colors_for_read_types <- these_colors_with_MEND
names(colors_for_read_types) <- gsub(" reads", "", names(these_colors_with_MEND))

colors_for_read_types <- c(colors_for_read_types, 
                           "Genome" = "darkblue",
                           "Exon" = "darkblue",
                           "Mapped reads" = "darkgrey")

plot1_colnames <- c("label", "x1", "x2", "y1", "y2")

read_type_schematic_data_as_vector <- c("Duplicate", 3, 4, 5, 6, 
                       "MEND", 3, 4, 4, 5, 
                       "MEND", 2.5, 3.5, 3, 4, 
                       "Non-exonic", 4.5, 5.5, 3, 4,
                       "Unmapped", 6, 7, 3, 4,
                       "Exon", 2.5, 4, 1, 2,
                       "Mapped reads", 2.8, 2.8, 6, 7)

rows_in_schematic <- length(read_type_schematic_data_as_vector)/length(plot1_colnames)

read_type_schematic_data <- matrix(read_type_schematic_data_as_vector, 
                     ncol = length(plot1_colnames), byrow = TRUE,
                     dimnames = list(c(1:rows_in_schematic), plot1_colnames)) %>%
  as_tibble %>%
  type_convert() %>%
  mutate(
    base_color = colors_for_read_types[match(label, names(colors_for_read_types))],
    border_color = case_when(
      label %in% c("Mapped reads", "Genome", "MEND", "Exon") ~ NA_character_,
      TRUE ~ base_color),
    fill_color = ifelse(label %in% c("MEND", "Exon"),  base_color, "white"),
    text_color = case_when(
      label %in% c("MEND", "Exon")~  "white",
      TRUE ~ base_color),
    mid_bar_x = map2_dbl(x1, x2,  function(x, y) mean(c(x,y))),
    mid_bar_y = map2_dbl(y1, y2,  function(x, y) mean(c(x,y)))
  )
## Parsed with column specification:
## cols(
##   label = col_character(),
##   x1 = col_double(),
##   x2 = col_double(),
##   y1 = col_double(),
##   y2 = col_double()
## )
padding_size = 0.35
end_of_box <- sort(read_type_schematic_data$x2,partial=length(read_type_schematic_data$x2)-1)[length(read_type_schematic_data$x2)-1] + padding_size
read_type_schematic <- ggplot(read_type_schematic_data, 
                              aes(xmin=x1, xmax=x2, ymin=y1, ymax = y2, 
                                  fill = fill_color, color = border_color)) +
  geom_rect() +
  
  geom_segment(x=min(read_type_schematic_data$x1-padding_size), y=1.5, xend = 5.75, yend = 1.5, color = "darkblue") +   
  geom_text(aes(label = label, x = mid_bar_x, y=mid_bar_y,
                color = text_color), 
            size = 4,
            fontface = "bold") +
  geom_rect(aes(xmin = min(x1 - padding_size), 
                ymin = 2.5, 
                xmax = end_of_box,
                ymax=max(y2 + padding_size)), 
            fill = NA, color = "darkgrey", size = 0.25) +
  scale_fill_identity() +
  scale_color_identity() +
  theme_void() + 
  theme(
    panel.grid.major.x = element_line(color="lightgrey", size=0.2),
    ) + 
 scale_x_continuous(
   breaks = seq(min(read_type_schematic_data$x1) - padding_size + 0.25, end_of_box , 0.25))

read_type_schematic

Generate schematic defining read type fractions

stat_names <- c("pct_unmapped_of_total",  "pct_dupe_of_mapped", "pct_non_exonic_of_non_dupe")
stat_labels <- c("Unmapped",  "Duplicate", "Non-exonic") 
complement_stat_names <- c("Mapped reads",  "Non-duplicate reads", "Exonic reads") 
divisor_name = c("total", "mapped", "non-duplicate")

comparison_of_remaining <- read_type_fractions_long %>% 
  mutate(read_type_fraction_name = complement_stat_names[match(read_type_fraction_name, stat_names)]) %>%
  group_by(read_type_fraction_name) %>%
  summarize(min = round(1-max(dataset_value), 2),
    max = round(1-min(dataset_value), 2),
    median = round(1-median(dataset_value), 2),
    p25 = round(1-quantile(dataset_value, 0.25), 2),
    p75 = round(1-quantile(dataset_value, 0.75), 2)) %>%
#      mutate(bar_label = paste0 (read_type_fraction_name, "\n(", 100*min, "-", 100*max, "% of ", divisor_name[match(read_type_fraction_name, complement_stat_names)], "; x͂=",100* median, "%)"))
mutate(bar_label = paste0 (read_type_fraction_name, "\n(", 100*min, "-", 100*max, "% of ", divisor_name[match(read_type_fraction_name, complement_stat_names)], "; med.=", 100* median, "%)"))

positive_bars <- comparison_of_remaining %>%
  select(-min, -max, -p25, -p75) %>%
  mutate(read_type_fraction_label = read_type_fraction_name, 
         position = "1") %>%
  rename(abs_median = median)

negative_bars <- tibble(read_type_fraction_name = positive_bars$read_type_fraction_name,
                        read_type_fraction_label = stat_labels[match(read_type_fraction_name, complement_stat_names)],
                        bar_label = read_type_fraction_label,
                        abs_median = 1-positive_bars$abs_median,
                        position = "2")

total_bar <- tibble(read_type_fraction_name = "Total reads",
                    read_type_fraction_label = read_type_fraction_name,
                    bar_label = "Total reads\n(100% of reads)",
                    abs_median = 1,
                    position = "1"
                    )


MEND_stats_of_total <- counts_and_meta  %>%
  arrange(desc(Total_reads)) %>%
  mutate(
    pct_MEND_of_total = (Total_reads - MEND) / Total_reads
  ) %>%
  ungroup %>% 
  summarize(min = round(1-max(pct_MEND_of_total), 2),
    max = round(1-min(pct_MEND_of_total), 2),
    median = round(1-median(pct_MEND_of_total), 2))

MEND_bar <- tibble(read_type_fraction_name = "MEND reads",
                   read_type_fraction_label = read_type_fraction_name,
                   bar_label = with(MEND_stats_of_total, paste0 ("MEND reads\n(", 100*min, "-", 100*max, "% of total reads; med.=",100* median, "%)")),
                   abs_median = 1,
                   position = "1"
)

bar_names_in_order <- c("Total reads", "Mapped reads", "Non-duplicate reads", "Exonic reads", "MEND reads")
  
all_bars <- bind_rows(positive_bars, negative_bars, total_bar, MEND_bar) %>%
  mutate(read_type_fraction_name = factor(read_type_fraction_name, 
                                          levels = bar_names_in_order))

cat_space <- all_bars %>% filter(position == 1) %>%
  arrange(read_type_fraction_name) %>%
  select(read_type_fraction_name, abs_median) %>%
  ungroup %>%
  mutate(lagged1_abs_median = lag(abs_median, default =1),
         lagged2_abs_median = lag(abs_median, n=2, default =1),
         category_space = lagged1_abs_median*lagged2_abs_median
         ) %>%
  select(read_type_fraction_name, category_space)

these_colors_with_MEND = c(these_colors, "MEND reads"="#000000")


all_bars_rel <- left_join(all_bars, cat_space, by = "read_type_fraction_name") %>%
  mutate(rel_median=ifelse(read_type_fraction_name == "MEND reads",
                           abs_median[read_type_fraction_label=="Exonic reads"]*
                             category_space[read_type_fraction_label=="Exonic reads"], 
                           abs_median*category_space),
         read_type_fraction_name = factor(read_type_fraction_name, levels = rev(bar_names_in_order)),
         position = factor(position, levels = rev(sort(unique(position)))),
         bar_color = these_colors_with_MEND[match(read_type_fraction_name, names(these_colors_with_MEND))],
         fill_val = ifelse(position == 1, bar_color, NA),
         border_color_val = bar_color,
         text_color = ifelse(position ==1, "white", "black"),
font_size = ifelse(abs_median<0.1, 3,3.5),
text_angle = ifelse(abs_median<0.1, 90,0)
         ) 

read_type_fractions_schematic

read_type_fractions_schematic <- ggplot(all_bars_rel, aes(y=read_type_fraction_name,
                x=rel_median, group = position)) + 
   geom_col(aes(fill = fill_val,
                color = border_color_val
                )) +
   geom_text(aes(label=bar_label,
                 color = text_color,
                 size = font_size,
                 angle= text_angle), 
             position = position_stack(vjust = .5),
              fontface = "bold") +
   scale_fill_identity() +
   scale_color_identity()  +
  scale_size_identity() + 
   theme_void() +
  theme(axis.title.y=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank()) 


ggplot(all_bars_rel, aes(y=read_type_fraction_name,
                x=rel_median, group = position)) + 
   geom_col(aes(fill = fill_val,
                color = border_color_val
                )) +
   geom_text(aes(label=read_type_fraction_label,
                 color = text_color,
                 size = font_size,
                 angle = text_angle), 
             position = position_stack(vjust = .5),
              fontface = "bold") +
   scale_fill_identity() +
   scale_color_identity()  +
  scale_size_identity() + 
   theme_void() +
  theme(axis.title.y=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank())

read_type_fractions_schematic

Figure 2

figure_name <- "f2"

f2 <- plot_grid(read_type_schematic,
read_type_fractions_schematic,
nrow=2,
rel_heights = c(1, 2),
labels = c("A", "B")) +
   draw_label(paste(figure_name, Sys.time()),
              x = 0, y = 0.01, hjust = 0, size = 6) 
# 
# f2 <- plot_grid(left_side,
# right_side,
# ncol = 2) +
#    draw_label(paste(figure_name, Sys.time()),
#               x = 0, y = 0.02, hjust = 0, size = 6) 

f2

ggsave("results/Figure_2.png", f2, height=6, width=5)
ggsave("results/Figure_2.pdf", f2, height=6, width=5)
stat_names <- c("pct_unmapped_of_total",  "pct_dupe_of_mapped", "pct_non_exonic_of_non_dupe")
stat_label <- c("% unmapped", "% duplicate \n(of mapped)", "% non-exonic \n(of non-duplicate)")
names(stat_label) <- stat_names

Plot read fractions

colors_for_stat_names <- these_colors[c("Mapped reads", "Non-duplicate reads","Exonic reads")]
names(colors_for_stat_names) <- stat_names

these_colors <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C", 
"#FDBF6F", "#FF7F00")
# light blue, dark blue, etc: green, red, orange (then purple and brown)
names(these_colors) <-  c("not assigned", "Total reads", "Non-exonic reads",  "Exonic reads", "Duplicate reads",  "Non-duplicate reads",  "Unmapped reads", "Mapped reads")

# 6 x 11 was a good ratio, but the text was too small

read_type_fractions_long_for_histogram <- read_type_fractions_long %>%
  mutate(read_type_fraction_name = factor(read_type_fraction_name,
                                          levels = stat_names))

read_type_fractions_histogram <-  ggplot(read_type_fractions_long_for_histogram) +
    geom_histogram(aes(x = dataset_value, color = read_type_fraction_name), 
                   fill = "white", stat = StatBin2)  +
  scale_color_manual(values = colors_for_stat_names) +
  facet_wrap(~read_type_fraction_name, 
             nrow = 1, 
             scales = "free_y",
             labeller = labeller(
               read_type_fraction_name = stat_label
               )
             ) +
  ylab("Datasets") +
  xlab(paste0("Read type percentages in ", length(unique(read_type_fractions_long_for_histogram$dataset_id)),  " datasets")) +
  scale_x_continuous(breaks = seq(0, 1, by = 0.5)) +
    theme_minimal() +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
    theme(strip.text.x = element_text(size = 12, face = "bold")) +
    theme(legend.position = "none")

read_type_fractions_histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Calculate MEND reads as a fraction of total

read_counts_with_MEND_fractions <- counts_and_meta  %>%
  arrange(desc(Total_reads)) %>%
  mutate(pct_MEND_of_total = MEND /Total_reads)

ggplot(read_counts_with_MEND_fractions) + 
  geom_histogram(aes(pct_MEND_of_total), stat = StatBin2) +
    theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

MEND_pct <- read_counts_with_MEND_fractions %>% 
  ungroup %>%
  mutate(dataset_value = pct_MEND_of_total) %>%
  summarize(variance= var(dataset_value),
            min = min(dataset_value),
            max = max(dataset_value),
            median = median(dataset_value),
            p25 = quantile(dataset_value, 0.25),
            p75 = quantile(dataset_value, 0.75),
            iqr = IQR(dataset_value))

kable(MEND_pct %>%
        select(-variance, -p25, -p75) %>%
      mutate_if(is.double, ~100*.), digits = 0) %>% 
  kable_styling(bootstrap_options = "striped", full_width = F)
min max median iqr
0 79 50 31
head(sort(read_counts_with_MEND_fractions$pct_MEND_of_total))
## [1] 0.0000267009 0.0002533711 0.0003651723 0.0003767257 0.0003997294
## [6] 0.0004279584
sum(read_counts_with_MEND_fractions$pct_MEND_of_total<0.25)
## [1] 355
read_counts_with_MEND_fractions %>%
  ungroup %>%
  mutate(pct_under_25 = pct_MEND_of_total<0.25) %>%
  tabyl(pct_under_25) %>%
  adorn_totals()
##  pct_under_25    n   percent
##         FALSE 1824 0.8370812
##          TRUE  355 0.1629188
##         Total 2179 1.0000000
read_counts_with_MEND_fractions %>%
  ungroup %>%
  filter(Mapped > 30E6) %>%
  mutate(pct_under_25 = pct_MEND_of_total<0.25) %>%
  tabyl(pct_under_25) %>%
  adorn_totals()
##  pct_under_25    n   percent
##         FALSE 1739 0.8368624
##          TRUE  339 0.1631376
##         Total 2078 1.0000000

Show per-cohort distribution of read_type_fractions

figure_name <- "fracs_by_group"

read_type_fractions_long_for_boxplot <- read_type_fractions_long_for_histogram %>%
  ungroup %>%
  mutate(boxplot_label = fct_reorder(paste0(as.character(cohort_code), ", n=", n_in_cohort), n_in_cohort))


read_type_fractions_per_cohort_boxplot <- ggplot(read_type_fractions_long_for_boxplot) +
  geom_boxplot(aes(x = dataset_value, y=boxplot_label), outlier.size = 0.5)  +
  facet_wrap(~read_type_fraction_name, 
             nrow = 1, 
             labeller = labeller(
               read_type_fraction_name = stat_label
             )
  ) +
  ylab("Cohorts") +
  xlab(paste0("Read type percentages in ", length(unique(read_type_fractions_long_for_boxplot$dataset_id)),  " datasets")) +
  scale_x_continuous(breaks = seq(0, 1, by = 0.5)) +
    theme_minimal() +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  theme(strip.text.x = element_text(size = 12, face = "bold")) +
  theme(legend.position = "none")


read_type_fractions_per_cohort_boxplot

Characterize Cohort 4

read_type_names <- c("Total_reads", "Mapped",  "MND", "MEND")
this_cohort_name <- "EGAD00001003279"

this_cohort_code <- subset(read_counts_with_read_type_fractions, cohort_name == this_cohort_name)$cohort_code[1]

this_cohort_code
## [1] "Cohort_4"

in MS

 read_counts_with_read_type_fractions %>%
  filter(cohort_code == this_cohort_code) %>%
   mutate(gt_98 = pct_dupe_of_mapped > 0.98) %>%
   tabyl(gt_98) %>%
   add_totals_row
## Warning: 'add_totals_row' is deprecated.
## Use 'adorn_totals("row")' instead.
## See help("Deprecated")
##  gt_98   n   percent
##  FALSE  55 0.4330709
##   TRUE  72 0.5669291
##  Total 127 1.0000000
read_counts_with_read_type_fractions %>%
  filter(cohort_code == this_cohort_code) %>%
  filter(pct_dupe_of_mapped > 0.98) %>%
  pull(Total_reads) %>% min
## [1] 178294341

other

 counts_and_meta_long <- counts_and_meta  %>%
  pivot_longer(cols = c(Total_reads, Mapped, MEND, MND), names_to = "read_type_name", values_to = "dataset_value")  %>%
   ungroup %>%
   mutate(read_type_name = factor(read_type_name, levels = read_type_names))

counts_and_meta_long_one_proj <-  counts_and_meta_long %>%
  filter(cohort_code == this_cohort_code)

table(counts_and_meta_long_one_proj$read_type_name)
## 
## Total_reads      Mapped         MND        MEND 
##         127         127         127         127
# 78 datasets in the cohort have fewer than 0.2 million MEND reads. 
sum((counts_and_meta_long_one_proj %>% filter(read_type_name=="MEND"))$dataset_value < 200000)
## [1] 78
those_datasets <- counts_and_meta_long_one_proj %>% filter(read_type_name=="MEND", dataset_value < 200000) %>% pull(dataset_id)


summary((counts_and_meta_long_one_proj %>% filter(read_type_name=="Total_reads", dataset_id %in% those_datasets))$dataset_value)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 178294341 199916664 208667776 211606891 223832191 247618832
max_plots_to_include <- 10
n_datasets_to_plot <- ifelse(n_distinct(counts_and_meta_long_one_proj$dataset_id)>max_plots_to_include, 
                         max_plots_to_include, 
                         n_distinct(counts_and_meta_long_one_proj$dataset_id))
plot_word <- ifelse(n_datasets_to_plot==max_plots_to_include, max_plots_to_include, paste("all", n_distinct(counts_and_meta_long_one_proj$dataset_id)))

set.seed(100)
datasets_to_plot <- sample(unique(counts_and_meta_long_one_proj$dataset_id), n_datasets_to_plot)

ggplot(subset(counts_and_meta_long_one_proj, dataset_id %in% datasets_to_plot)) + 
  geom_col(aes(x=read_type_name, y=dataset_value/1e6, fill = read_type_name)) +
  facet_wrap(~dataset_id, ncol = 1, strip.position="right")  +
    theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  theme(strip.text.y = element_text(angle = 0)) +
  scale_fill_brewer(palette = "Set1")  +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  ggtitle(paste("Read counts for", plot_word, "datasets from", this_cohort_code)) +
  coord_flip()

ggplot(counts_and_meta_long_one_proj %>% filter(read_type_name == "MEND")) + 
  geom_histogram(aes(dataset_value/1e6), stat = StatBin2) +
    theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  ggtitle(paste("Distribution of MEND counts in", this_cohort_code))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

bl <- colorRampPalette(c("navy","royalblue","lightskyblue"))(200)                      
re <- colorRampPalette(c("mistyrose", "red2","darkred"))(200)


ggplot(counts_and_meta %>% 
         filter(cohort_code == this_cohort_code) %>%
         mutate(pct_dupes = (Mapped - MND)/Mapped)) + 
  geom_point(aes(x=MEND/1e6, y=pct_dupes, fill = Total_reads/1e6), shape=21, color = "black") +
    theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  scale_fill_gradientn(colours = c(bl,"white", re)) +
  ggtitle(paste("Relationship of dupe fraction to MEND count in", this_cohort_code), "For detecting whether datasets with less info were sequenced more deeply. \nNormally, higher depth datasets have more MEND reads and more duplicate \nreads than the same dataset sequenced at a lower total depth.")

counts_and_meta_long_one_proj %>%
  select(seq_length, dataset_id) %>%
  distinct %>%
  tabyl(seq_length)
##  seq_length   n percent
##          51 127       1

Expressed genes

What is the distribution of measured genes?

gene_counts_long <- counts_and_meta %>%
  pivot_longer(cols = starts_with("genes_"), names_to = "Expression_threshold")  %>%
  mutate(Expression_threshold = paste(str_replace(Expression_threshold, "genes_expressed_above_", "genes > "), "TPM"))  


ggplot(gene_counts_long) + 
  geom_histogram(aes(x=value/1e3, fill = Expression_threshold), stat = StatBin) +
  facet_wrap(~Expression_threshold)  +
    theme_minimal() +
  scale_fill_brewer(palette = "Set1") +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
    theme(legend.position = "none") +
  ylab("Datasets") +
  xlab("Measured genes") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

What are the exact lowest gene counts?

sort(counts_and_meta$genes_expressed_above_0) [1:100]
##   [1]     9     9    12    12    13    14    14    15    16    16    17
##  [12]    18    18    18    18    18    18    19    19    19    19    20
##  [23]    20    20    20    20    20    21    21    21    21    21    21
##  [34]    22    22    22    23    23    23    23    23    24    24    24
##  [45]    24    24    24    24    25    25    25    26    26    26    26
##  [56]    27    27    27    27    28    28    28    28    29    29    29
##  [67]    29    29    30    30    31    31    32    33    35    36    37
##  [78]    40  6098  7107 11830 15323 16793 16837 16943 17233 17531 18288
##  [89] 18588 18704 19626 19693 19749 19781 19942 19963 20063 20120 20203
## [100] 20247

correlations with all datasets

# library(corrr)

counts_and_meta %>% 
    ungroup %>% 
    select_if(is.double) %>%
       correlate() %>%
  filter(rowname %in% read_type_names) %>%
  rename(Read_type_count=rowname) %>% 
  mutate(Read_type_count = factor(Read_type_count, levels = read_type_names)) %>%
  arrange(Read_type_count) %>%
  select(c(Read_type_count, starts_with("genes_expressed"))) %>%
  kable(caption = "Correlations with all datasets", digits = 2) %>% 
  kable_styling(bootstrap_options = "striped", full_width = F)
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
Correlations with all datasets
Read_type_count genes_expressed_above_0 genes_expressed_above_1 genes_expressed_above_2 genes_expressed_above_3 genes_expressed_above_4 genes_expressed_above_5
Total_reads -0.20 -0.40 -0.40 -0.40 -0.39 -0.39
Mapped -0.19 -0.41 -0.41 -0.41 -0.40 -0.39
MND 0.51 0.24 0.20 0.18 0.17 0.16
MEND 0.48 0.27 0.26 0.26 0.26 0.26

correlations with all but low gene count datasets

counts_and_meta %>% 
  filter(genes_expressed_above_0 > min_genes_gt_0) %>%
    ungroup %>% 
    select_if(is.double) %>%
       correlate() %>%
  filter(rowname %in% read_type_names) %>%
  rename(Read_type_count=rowname) %>% 
  mutate(Read_type_count = factor(Read_type_count, levels = read_type_names)) %>%
  arrange(Read_type_count) %>%
  select(c(Read_type_count, starts_with("genes_expressed"))) %>%
  kable(caption = "Correlations with all but low gene count datasets", digits = 2) %>% 
  kable_styling(bootstrap_options = "striped", full_width = F)
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
Correlations with all but low gene count datasets
Read_type_count genes_expressed_above_0 genes_expressed_above_1 genes_expressed_above_2 genes_expressed_above_3 genes_expressed_above_4 genes_expressed_above_5
Total_reads 0.13 -0.26 -0.28 -0.28 -0.28 -0.28
Mapped 0.21 -0.25 -0.26 -0.27 -0.27 -0.27
MND 0.51 0.04 0.02 0.00 0.00 0.00
MEND 0.44 0.08 0.09 0.11 0.11 0.12

Plot correlations betweeen gene counts and read types

Groom data

expr_gene_and_read_counts_to_plot <- counts_and_meta %>%
  pivot_longer (cols = c(MND, MEND, Total_reads, Mapped), names_to = "Read_type", values_to = "Read_count") %>%
  pivot_longer (cols = starts_with("genes_expressed"), names_to = "Expression_threshold", values_to = "Gene_count") %>%
  mutate(Read_type = factor(Read_type, levels = read_type_names))

Identify datasets with unusual gene counts or read depths

datasets_with_normal_expressed_gene_numbers <- counts_and_meta %>%
  filter(genes_expressed_above_0 > min_genes_gt_0) %>%
  pull(dataset_id)

datasets_with_outlier_gene_counts <- counts_and_meta %>%
  ungroup %>%
  filter(is_outlier(genes_expressed_above_0)) %>%
  pull(dataset_id) 

datasets_with_outlier_depths <- counts_and_meta %>%
  ungroup %>%
  filter(is_outlier(Total_reads)) %>%
  pull(dataset_id) 

Function for calculating gene count correlations and plotting results

plot_gene_count_correlations <- function(this_data = expr_gene_and_read_counts_to_plot, this_plot_title = "gene count correlations"){

this_data <- this_data %>% 
  mutate(Expression_threshold = paste(str_replace(Expression_threshold, "genes_expressed_above_", "genes > "), "TPM"))  
  
  
these_stats <- this_data  %>%
group_by(Read_type, Expression_threshold) %>%
  summarize(r_corr = round(cor(Gene_count, Read_count), 2)) 

# unique(this_data$Read_type)
# dput(unique(this_data$Read_type))
# these_colors_with_MEND

colors_for_correlation_plot <- c("Total_reads"="#1F78B4", "Mapped"="#FF7F00", 
"MND"="#E31A1C", "MEND"="#000000")

ggplot(this_data,
       aes(x=Read_count/1E6, y=Gene_count/1e3, color = Read_type)) + 
  geom_point(alpha = 0.5, shape = 20, size =0.1) +
  geom_smooth(method = "lm", color = "black") +
  geom_label(data = these_stats, 
             aes(label=paste0("r=", r_corr)), 
             x=max(this_data$Read_count/1E6),
             y=max(this_data$Gene_count/1e3),
             hjust = 1, vjust = 1
             ) +
    theme_minimal() +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  scale_color_manual(values = colors_for_correlation_plot) +
  facet_grid(Read_type~Expression_threshold) +
  ggtitle(this_plot_title)  + 
  theme(legend.position="none") +
  xlab("Read counts (million)") +
  ylab("Gene counts (thousand)")

}

Figure S2

fs2 <- plot_gene_count_correlations(expr_gene_and_read_counts_to_plot,
                             "Correlations calculated across all datasets")

fs2
## `geom_smooth()` using formula 'y ~ x'

ggsave("results/Figure_S2.png", fs2 + ggtitle(""))
## Saving 7 x 5 in image
## `geom_smooth()` using formula 'y ~ x'
ggsave("results/Figure_S2.pdf", fs2 + ggtitle(""))
## Saving 7 x 5 in image
## `geom_smooth()` using formula 'y ~ x'

for all datasets

plot_gene_count_correlations(expr_gene_and_read_counts_to_plot %>% 
  filter(dataset_id %in% datasets_with_normal_expressed_gene_numbers),
  paste("Correlations in datasets with more than", min_genes_gt_0, "genes expressed above 0"))
## `geom_smooth()` using formula 'y ~ x'

plot_gene_count_correlations(expr_gene_and_read_counts_to_plot %>% 
  filter(! dataset_id %in% datasets_with_outlier_gene_counts),
  paste("Correlations calculated across all but outlier gene count datasets"))
## `geom_smooth()` using formula 'y ~ x'

plot_gene_count_correlations(expr_gene_and_read_counts_to_plot %>% 
  filter(! dataset_id %in% datasets_with_outlier_depths),
  paste("Correlations calculated across all but outlier read depth datasets"))
## `geom_smooth()` using formula 'y ~ x'

plot_gene_count_correlations(expr_gene_and_read_counts_to_plot %>% 
  filter(! dataset_id %in% datasets_with_outlier_depths, 
         ! dataset_id %in% datasets_with_outlier_gene_counts
         ),
  paste("Correlations calculated across all but datasets with outlier read depth or gene count"))
## `geom_smooth()` using formula 'y ~ x'

plot_gene_count_correlations(
  expr_gene_and_read_counts_to_plot %>% 
    filter(disease ==  "acute lymphoblastic leukemia"),
  "Correlations calculated among acute lymphoblastic leukemia datasets")
## `geom_smooth()` using formula 'y ~ x'

corr plot for figure

max_total_read_depth <- 250*1e6
print(min_genes_gt_0)
## [1] 100
sum(counts_and_meta$genes_expressed_above_0 <= min_genes_gt_0)
## [1] 78
sum(counts_and_meta$Total_reads > max_total_read_depth)
## [1] 105
# min_genes_gt_0
exclude_for_depth <- counts_and_meta %>%
  filter(Total_reads > max_total_read_depth) %>%
  pull(dataset_id)

exclude_for_gene_count <- counts_and_meta %>%
  filter(genes_expressed_above_0 < min_genes_gt_0) %>%
  pull(dataset_id)


this_data <- expr_gene_and_read_counts_to_plot %>% 
      filter(
        Expression_threshold %in% paste0("genes_expressed_above_", 1:3),
       ! dataset_id %in% exclude_for_depth, 
        ! dataset_id %in% exclude_for_gene_count
      ) %>%
  mutate(Expression_threshold = paste(str_replace(Expression_threshold, "genes_expressed_above_", "genes > "), "TPM"))  

n_distinct(this_data$dataset_id)
## [1] 1996
this_plot_title <- paste("Correlations calculated across all but datasets with excessive read depth or low gene count")

these_stats <- this_data  %>%
group_by(Read_type, Expression_threshold) %>%
  summarize(r_corr = round(cor(Gene_count, Read_count), 2))

colors_for_correlation_plot <- c("Total_reads"="#1F78B4", "Mapped"="#FF7F00", 
"MND"="#E31A1C", "MEND"="#000000")

cor_plots_excluding_high_read_depth_and_low_gene_count_datasets <- ggplot(this_data,
       aes(x=Read_count/1E6, y=Gene_count/1e3, color = Read_type)) + 
  geom_point(alpha = 0.5, shape = 20, size =0.1) +
  geom_smooth(method = "lm", color = "black") +
  geom_label(data = these_stats, 
             aes(label=paste0("r=", r_corr)), 
             x=max(this_data$Read_count/1E6),
             y=max(this_data$Gene_count/1e3),
             hjust = 1, vjust = 1
             ) +
    theme_minimal() +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5),
        strip.text = element_text(face = "bold")) +
  scale_color_manual(values = colors_for_correlation_plot) +
  facet_grid(Read_type~Expression_threshold) +
  theme(legend.position="none") +
  scale_x_continuous("Read counts (million)", n.breaks = 4) +
  ylab("Gene counts (thousand)") 


cor_plots_excluding_high_read_depth_and_low_gene_count_datasets
## `geom_smooth()` using formula 'y ~ x'

Figure 3

f3a <- read_type_fractions_histogram +
            theme(strip.text.x = element_text(size = 10, face = "bold"),
                  axis.title.x=element_blank(),
                  axis.text.x=element_blank())

f3b <- read_type_fractions_per_cohort_boxplot +
            theme(strip.text.x = element_blank(),
                  strip.background = element_blank(),
                  strip.text = element_blank())

f3ab <- plot_grid(f3a, 
                  f3b,
                  ncol = 1,
                  rel_heights = c(1.5, 6),
                  align = "v",
                  labels = "AUTO")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
f3c <- plot_grid(cor_plots_excluding_high_read_depth_and_low_gene_count_datasets,
          labels = "C")
## `geom_smooth()` using formula 'y ~ x'
f3 <- plot_grid(f3ab,
                 f3c,
                 nrow=1,
                 rel_widths = c(4,2.3))

figure_name <- "f3"
f3_labeled <- f3 +   draw_label(paste(figure_name, Sys.time()),
             x = 0, y = 0.02, hjust = 0, size = 6) 


f3_labeled

ggsave("results/Figure_3.png", f3_labeled, height=8, width=10)
ggsave("results/Figure_3.pdf", f3_labeled, height=8, width=10)

Plot sequence length against pct duplicates

ggplot(read_counts_with_read_type_fractions) + 
  geom_point(aes(x=seq_length, y=pct_dupe_of_mapped))

ggplot(read_counts_with_read_type_fractions) + 
  geom_boxplot(aes(x=seq_length, y=pct_dupe_of_mapped, group = seq_length))

Generate bibtex formatted citations for packages

used_pkgs <- c("base", "tidyverse", "janitor", "knitr", "cowplot", "RColorBrewer", "pander", "kableExtra", "snakecase", "corrr")
write_bib(used_pkgs)
## @Manual{R-base,
##   title = {R: A Language and Environment for Statistical Computing},
##   author = {{R Core Team}},
##   organization = {R Foundation for Statistical Computing},
##   address = {Vienna, Austria},
##   year = {2019},
##   url = {https://www.R-project.org/},
## }
## @Manual{R-corrr,
##   title = {corrr: Correlations in R},
##   author = {Edgar Ruiz and Simon Jackson and Jorge Cimentada},
##   year = {2019},
##   note = {R package version 0.4.0},
##   url = {https://CRAN.R-project.org/package=corrr},
## }
## @Manual{R-cowplot,
##   title = {cowplot: Streamlined Plot Theme and Plot Annotations for 'ggplot2'},
##   author = {Claus O. Wilke},
##   year = {2019},
##   note = {R package version 1.0.0},
##   url = {https://CRAN.R-project.org/package=cowplot},
## }
## @Manual{R-janitor,
##   title = {janitor: Simple Tools for Examining and Cleaning Dirty Data},
##   author = {Sam Firke},
##   year = {2019},
##   note = {R package version 1.2.0},
##   url = {https://CRAN.R-project.org/package=janitor},
## }
## @Manual{R-kableExtra,
##   title = {kableExtra: Construct Complex Table with 'kable' and Pipe Syntax},
##   author = {Hao Zhu},
##   year = {2019},
##   note = {R package version 1.1.0},
##   url = {https://CRAN.R-project.org/package=kableExtra},
## }
## @Manual{R-knitr,
##   title = {knitr: A General-Purpose Package for Dynamic Report Generation in R},
##   author = {Yihui Xie},
##   year = {2019},
##   note = {R package version 1.25},
##   url = {https://CRAN.R-project.org/package=knitr},
## }
## @Manual{R-pander,
##   title = {pander: An R 'Pandoc' Writer},
##   author = {Gergely Daróczi and Roman Tsegelskyi},
##   year = {2018},
##   note = {R package version 0.6.3},
##   url = {https://CRAN.R-project.org/package=pander},
## }
## @Manual{R-RColorBrewer,
##   title = {RColorBrewer: ColorBrewer Palettes},
##   author = {Erich Neuwirth},
##   year = {2014},
##   note = {R package version 1.1-2},
##   url = {https://CRAN.R-project.org/package=RColorBrewer},
## }
## @Manual{R-snakecase,
##   title = {snakecase: Convert Strings into any Case},
##   author = {Malte Grosser},
##   year = {2019},
##   note = {R package version 0.11.0},
##   url = {https://CRAN.R-project.org/package=snakecase},
## }
## @Manual{R-tidyverse,
##   title = {tidyverse: Easily Install and Load the 'Tidyverse'},
##   author = {Hadley Wickham},
##   year = {2017},
##   note = {R package version 1.2.1},
##   url = {https://CRAN.R-project.org/package=tidyverse},
## }

Does the depth of sequencing correlate to the number of duplicates? How much variance does that explain?

cor_tot_reads_and_dupes <- cor(read_counts_with_read_type_fractions$Total_reads, 
    read_counts_with_read_type_fractions$pct_dupe_of_mapped,
    method = c("pearson"))
round(cor_tot_reads_and_dupes, 2)
## [1] 0.58
round(cor_tot_reads_and_dupes^2, 2)
## [1] 0.34

SessionInfo

pander(sessionInfo(), compact = FALSE)

R version 3.6.0 (2019-04-26)

Platform: x86_64-apple-darwin15.6.0 (64-bit)

locale: en_US.UTF-8||en_US.UTF-8||en_US.UTF-8||C||en_US.UTF-8||en_US.UTF-8

attached base packages:

  • stats
  • graphics
  • grDevices
  • utils
  • datasets
  • methods
  • base

other attached packages:

  • corrr(v.0.4.0)
  • snakecase(v.0.11.0)
  • kableExtra(v.1.1.0)
  • pander(v.0.6.3)
  • RColorBrewer(v.1.1-2)
  • cowplot(v.1.0.0)
  • knitr(v.1.25)
  • janitor(v.1.2.0)
  • forcats(v.0.4.0)
  • stringr(v.1.4.0)
  • dplyr(v.0.8.5)
  • purrr(v.0.3.3)
  • readr(v.1.3.1)
  • tidyr(v.1.0.0)
  • tibble(v.3.0.1)
  • ggplot2(v.3.3.0)
  • tidyverse(v.1.2.1)

loaded via a namespace (and not attached):

  • tidyselect(v.1.1.0)
  • xfun(v.0.10)
  • lattice(v.0.20-38)
  • splines(v.3.6.0)
  • haven(v.2.1.1)
  • colorspace(v.1.4-1)
  • vctrs(v.0.3.0)
  • generics(v.0.0.2)
  • htmltools(v.0.4.0)
  • viridisLite(v.0.3.0)
  • mgcv(v.1.8-29)
  • yaml(v.2.2.0)
  • rlang(v.0.4.6)
  • pillar(v.1.4.3)
  • glue(v.1.4.1)
  • withr(v.2.1.2)
  • modelr(v.0.1.5)
  • readxl(v.1.3.1)
  • lifecycle(v.0.2.0)
  • munsell(v.0.5.0)
  • gtable(v.0.3.0)
  • cellranger(v.1.1.0)
  • rvest(v.0.3.4)
  • evaluate(v.0.14)
  • labeling(v.0.3)
  • fansi(v.0.4.0)
  • highr(v.0.8)
  • broom(v.0.7.0)
  • Rcpp(v.1.0.3)
  • scales(v.1.1.0)
  • backports(v.1.1.5)
  • webshot(v.0.5.1)
  • jsonlite(v.1.6)
  • farver(v.2.0.1)
  • hms(v.0.5.1)
  • digest(v.0.6.23)
  • stringi(v.1.4.3)
  • grid(v.3.6.0)
  • cli(v.2.0.0)
  • tools(v.3.6.0)
  • magrittr(v.1.5)
  • crayon(v.1.3.4)
  • pkgconfig(v.2.0.3)
  • Matrix(v.1.2-17)
  • ellipsis(v.0.3.0)
  • xml2(v.1.2.2)
  • lubridate(v.1.7.4)
  • assertthat(v.0.2.1)
  • rmarkdown(v.1.16)
  • httr(v.1.4.1)
  • rstudioapi(v.0.10)
  • R6(v.2.4.1)
  • nlme(v.3.1-141)
  • compiler(v.3.6.0)